Conv2d

Computes a two-dimensional convolution over the input tensor. The input has shape \((N, H_{in}, W_{in}, C_{in})\), where \(N\) is the batch size, \(C_{in}\) is the number of channels, \(H_{in}\) is the height of the feature map, and \(W_{in}\) is the width of the feature map.

The output is computed according to the following formula:

\[out(N_i, C_{out_j}) = bias(C_{out_j}) + \sum_{k=0}^{C_{in}-1} \text{ccor}(\text{weight}(C_{out_j}, k), X(N_i, k))\]

where \(bias\) is the output bias, \(\text{ccor}\) is the cross-correlation operation, \(weight\) is the convolution kernel, and \(X\) is the input feature map.

  • \(i\) indexes the batch and ranges over \([0, N-1]\), where \(N\) is the input batch size.

  • \(j\) indexes the output channel and ranges over \([0, C_{out}-1]\), where \(C_{out}\) is the number of output channels, which also equals the number of convolution kernels.

  • \(k\) indexes the input channel and ranges over \([0, C_{in}-1]\), where \(C_{in}\) is the number of input channels, which also equals the number of channels in each convolution kernel.

Accordingly, in the formula above, \(bias(C_{out_j})\) is the bias of the \(j\)-th output channel, \(weight(C_{out_j}, k)\) is the slice of the \(j\)-th convolution kernel for the \(k\)-th input channel, and \(X(N_i, k)\) is the slice of the \(i\)-th batch of the feature map at the \(k\)-th input channel. A single kernel slice has shape \((\text{kernel_size}[0], \text{kernel_size}[1])\), where kernel_size[0] and kernel_size[1] are the height and width of the kernel. Taking the input and output channels as well as groups into account, the full weight tensor has shape \((C_{out}, \text{kernel_size}[0], \text{kernel_size}[1], C_{in}/\text{group})\), where group is the number of groups into which the input \(x\) is split along the channel dimension for grouped convolution.
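
For orientation, the spatial size of the output follows the usual convolution arithmetic built from the padding, dilation, stride and kernel parameters defined in ConvParameter below. This is the standard textbook formula, given here as a reading aid rather than quoted from the library:

\[H_{out} = \left\lfloor \frac{H_{in} + pad_h - d_h \times (k_h - 1) - 1}{s_h} \right\rfloor + 1\]

where \(k_h\), \(s_h\) and \(d_h\) correspond to kernel_h_, stride_h_ and dilation_h_, and \(pad_h\) is the total padding applied along the height dimension; \(W_{out}\) is computed analogously from the w-direction parameters.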

Inputs:
  • input_x - address of the input data

  • input_w - address of the convolution kernel weights

  • bias - address of the bias

  • conv_param - structure holding the parameters required by the operator; its members are listed below.

  • quant_param - structure holding the parameters required for quantized int8 computation; its members are listed below.

  • core_mask - core mask selecting the cores that take part in the computation.

Definitions of ConvParameter and ConvQuantParameter:

typedef struct ConvParameter {
    void* workspace_; // buffer for intermediate results
    int output_batch_; // number of output batches
    int input_batch_; // number of input batches
    int input_h_; // input height
    int input_w_; // input width
    int output_h_; // output height
    int output_w_; // output width
    int input_channel_; // number of input channels
    int output_channel_; // number of output channels
    int kernel_h_; // kernel height
    int kernel_w_; // kernel width
    int group_; // number of groups
    int pad_l_; // left padding
    int pad_u_; // top padding
    int dilation_h_; // kernel dilation along h
    int dilation_w_; // kernel dilation along w
    int stride_h_; // stride along h
    int stride_w_; // stride along w
    int buffer_size_; // buffer size allocated for tiled computation
} ConvParameter;

typedef struct ConvQuantParameter {
    int32_t* left_shift_; // left-shift amounts used in requantization
    int32_t* right_shift_; // right-shift amounts used in requantization
    int32_t* multiplier_; // fixed-point multipliers used in requantization
    int32_t* filter_zp_ptr_; // zero points of the filter (weights)
    int32_t output_zp_; // zero point of the output
    int32_t mini_; // lower clamp bound of the quantized output
    int32_t maxi_; // upper clamp bound of the quantized output
    int per_channel_; // non-zero if the parameters above are given per output channel
} ConvQuantParameter;
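
The library does not document how these quantization parameters are applied inside the int8 kernel. As an orientation aid only, the following is a minimal sketch, assuming the common fixed-point requantization scheme (pre-shift, high-bit multiply, post-shift, zero-point offset, clamp); the helper name requantize, the rounding behavior, and the absence of saturation handling are assumptions, not the library's actual implementation.

#include <stdint.h>

/* Hypothetical sketch only: converts one int32 accumulator to an int8 output
 * using ConvQuantParameter-style fields. Assumes a gemmlowp-like fixed-point
 * scaling; the real kernel may round, saturate, and index differently. */
static inline int8_t requantize(int32_t acc, const ConvQuantParameter *qp, int oc) {
    int c = qp->per_channel_ ? oc : 0;                    // per-channel or shared parameters
    int64_t v = (int64_t)acc << qp->left_shift_[c];       // pre-shift
    v = (v * qp->multiplier_[c] + (1ll << 30)) >> 31;     // fixed-point multiply (Q31, round to nearest)
    v >>= qp->right_shift_[c];                            // post-shift
    v += qp->output_zp_;                                  // add output zero point
    if (v < qp->mini_) v = qp->mini_;                     // clamp to activation range
    if (v > qp->maxi_) v = qp->maxi_;
    return (int8_t)v;
}
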
Output:
  • out_y - output address.

Supported platforms:

FT78NE MT7004

Notes

  • FT78NE supports int8 and fp32

  • MT7004 supports fp16 and fp32

Shared-memory version:

void i8_conv2d_s(int8_t *input_x, int8_t *input_w, int8_t *out_y, int *bias, ConvParameter *conv_param, ConvQuantParameter quant_param, int core_mask)
void hp_conv2d_s(half *input_x, half *input_w, half *out_y, half *bias, ConvParameter *conv_param, int core_mask)
void fp_conv2d_s(float *input_x, float *input_w, float *out_y, float *bias, ConvParameter *conv_param, int core_mask)

C call example:

#include <string.h> // memcpy
// The example also assumes the device SDK header that declares ConvParameter,
// fp_conv2d_s, get_core_id, GetLogicCoreId, GetCoreNum and sys_bar.

void TestConvSMCFp32(int* input_shape, int* weight_shape, int* output_shape, int* stride, int* padding, int* dilation, int groups, float* bias, int core_mask) {
    int core_id = get_core_id();
    int logic_core_id = GetLogicCoreId(core_mask, core_id);
    int core_num = GetCoreNum(core_mask);
    float* input_data = (float*)0x88000000;
    float* weight = (float*)0x89000000;
    float* output_data = (float*)0x90000000;
    float* bias_data = (float*)0x91000000;
    ConvParameter* param = (ConvParameter*)0x92000000;
    if (logic_core_id == 0) { // only one core fills in the shared parameters
        memcpy(bias_data, bias, sizeof(float) * output_shape[3]);
        param->dilation_h_ = dilation[0];
        param->dilation_w_ = dilation[1];
        param->group_ = groups;
        param->input_batch_ = input_shape[0];
        param->input_h_ = input_shape[1];
        param->input_w_ = input_shape[2];
        param->input_channel_ = input_shape[3];
        param->kernel_h_ = weight_shape[1];
        param->kernel_w_ = weight_shape[2];
        param->output_batch_ = output_shape[0];
        param->output_h_ = output_shape[1];
        param->output_w_ = output_shape[2];
        param->output_channel_ = output_shape[3];
        param->stride_h_ = stride[0];
        param->stride_w_ = stride[1];
        param->pad_u_ = padding[0];
        param->pad_l_ = padding[2];
        param->workspace_ = (float*)0x10000000; // the workspace must be allocated in AM; data is moved into it during computation
    }
    sys_bar(0, core_num); // synchronize once the parameters have been initialized
    fp_conv2d_s(input_data, weight, output_data, bias_data, param, core_mask);
}

void main() {
    int in_channel = 4;
    int out_channel = 4;
    int groups = 4;
    int input_shape[4] = {1, 30, 30, in_channel}; // NHWC
    int weight_shape[4] = {out_channel, 3, 3, in_channel / groups}; // (C_out, kernel_h, kernel_w, C_in / group)
    int output_shape[4] = {1, 10, 10, out_channel}; // NHWC
    int stride[2] = {2, 2};
    int padding[4] = {1, 1, 1, 1};
    int dilation[2] = {2, 2};
    float bias[4] = {0, 0, 0, 0};
    int core_mask = 0b1111;
    TestConvSMCFp32(input_shape, weight_shape, output_shape, stride, padding, dilation, groups, bias, core_mask);
}

Private-memory version:

void i8_conv2d_p(int8_t *input_x, int8_t *input_w, int8_t *out_y, int *bias, ConvParameter *conv_param, ConvQuantParameter quant_param, int core_mask)
void hp_conv2d_p(half *input_x, half *input_w, half *out_y, half *bias, ConvParameter *conv_param, int core_mask)
void fp_conv2d_p(float *input_x, float *input_w, float *out_y, float *bias, ConvParameter *conv_param, int core_mask)

C call example:

#include <string.h> // memcpy
// The example also assumes the device SDK header that declares ConvParameter
// and fp_conv2d_p.

void TestConvL2Fp32(int* input_shape, int* weight_shape, int* output_shape, int* stride, int* padding, int* dilation, int groups, float* bias, int core_mask) {
    float* input_data = (float*)0x10010000; // for the private-memory version the data addresses are placed in AM
    float* weight = (float*)0x10020000;
    float* output_data = (float*)0x10030000;
    float* bias_data = (float*)0x10040000;
    ConvParameter* param = (ConvParameter*)0x10060000;
    memcpy(bias_data, bias, sizeof(float) * output_shape[3]);
    param->dilation_h_ = dilation[0];
    param->dilation_w_ = dilation[1];
    param->group_ = groups;
    param->input_batch_ = input_shape[0];
    param->input_h_ = input_shape[1];
    param->input_w_ = input_shape[2];
    param->input_channel_ = input_shape[3];
    param->kernel_h_ = weight_shape[1];
    param->kernel_w_ = weight_shape[2];
    param->output_batch_ = output_shape[0];
    param->output_h_ = output_shape[1];
    param->output_w_ = output_shape[2];
    param->output_channel_ = output_shape[3];
    param->stride_h_ = stride[0];
    param->stride_w_ = stride[1];
    param->pad_u_ = padding[0];
    param->pad_l_ = padding[2];
    param->workspace_ = (float*)0x10070000;
    param->buffer_size_ = 2048; // must be set for the private-memory version; determines the tile size used for blocked computation
    fp_conv2d_p(input_data, weight, output_data, bias_data, param, core_mask);
}

void main() {
    int in_channel = 4;
    int out_channel = 4;
    int groups = 4;
    int input_shape[4] = {1, 30, 30, in_channel}; // NHWC
    int weight_shape[4] = {out_channel, 3, 3, in_channel / groups}; // (C_out, kernel_h, kernel_w, C_in / group)
    int output_shape[4] = {1, 10, 10, out_channel}; // NHWC
    int stride[2] = {2, 2};
    int padding[4] = {1, 1, 1, 1};
    int dilation[2] = {2, 2};
    float bias[4] = {0, 0, 0, 0};
    int core_mask = 0b0001; // the private-memory version must be launched on a single core only
    TestConvL2Fp32(input_shape, weight_shape, output_shape, stride, padding, dilation, groups, bias, core_mask);
}